library(data.table)
## data.table 1.14.8 using 1 threads (see ?getDTthreads).  Latest news: r-datatable.com
## **********
## This installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.
## This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux.
## **********
library(ggplot2)
library(plyr)
library(magrittr)
library(readr)
library(stringr)
library(CellBarcode)
library(ggrepel)
library(ROCR)
library(ggsci)
library(ggpubr)
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:plyr':
## 
##     mutate
theme0 <- theme_bw() + theme(
    text = element_text(size = 15),
    line = element_line(size = 1),
    axis.line = element_line(size = 1),
    axis.ticks.length = unit(3, units = "mm"),
    axis.text.x = element_text(
        margin = margin(t = 2, unit = "mm")
        , angle = 60, vjust = 1, size = 15, hjust = 1),
    axis.text.y = element_text(margin = margin(r = 3, l = 5, unit = "mm")),
    legend.position = "right",
) 
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
theme0_lt = theme0 + theme(legend.position = "top")
theme1 = theme0 + theme(
    axis.text.x = element_text( 
        margin = margin(t = 2, unit = "mm")
        , angle = 0, vjust = 1, size = 12, hjust = 0.5)
)

knitr::opts_chunk$set(fig.width=20, fig.height=12) 

Sample info

sample_info = fread("../run_simulation_no_umi/non_umi_simu_design_matrix.tsv")
sample_info$simu_id %<>% as.character
setkey(sample_info, "simu_id")

i_level_simu_name = sample_info[order(as.integer(simu_id)), simu_name][c(7, 1, 8, 23, 22, 24)]

Clone size disbribution

true_barcode_l = read_rds("../run_preprocess_simulation/tmp/non_umi_simulation_ref.rds")
d_true_barcode = true_barcode_l %>% rbindlist(idcol = "simu_file") 
x = d_true_barcode$simu_file %>% str_match("barcode_simulation_\\d+_simu_seq_(.*)") %>% extract(, c(2))
table(x, useNA = "always")
## x
##     1    10    11    12    13    14    15    16    17    18    19     2    20    21    22    23    24    25    26    27    28     3     4     5     6     7     8     9  <NA> 
##  8999  8998  8999  8998  8999  8999  8998  9000  8997  8998  8863  4499  8854  8999  6861  6869  6855 11971 20058 17997 35978 17989 35981  8998  8999  9000  8999  8996     0
d_true_barcode = cbind(d_true_barcode, simu_id = x)
# d_true_barcode[simu_name == "hamming" & grepl("simulation_19_", simu_file), .N, by = barcode_seq][N > 1]
# d_true_barcode[simu_name == "vdj_barcode" & grepl("simulation_1_", simu_file), .N, by = barcode_seq][N > 1]
# d_true_barcode[simu_name == "vdj_barcode"]
d_true_barcode$simu_name = sample_info[d_true_barcode$simu_id, simu_name]
table(d_true_barcode$simu_name, useNA = "always")
## 
##                                barcode_length_10     barcode_size_mean_3_variation_1_clone_n_1200      barcode_size_mean_3_variation_1_clone_n_600                                             base                                     clone_n_1200 
##                                             8999                                            35978                                            17997                                             8999                                            35981 
##                                      clone_n_150                                      clone_n_600                                         cycle_20                                         cycle_40                                   efficiency_0.5 
##                                             4499                                            17989                                             8996                                             8998                                             8999 
##                                   efficiency_0.9                                       error_1e-5                                       error_1e-7                                          hamming                         hamming_size_variation_3 
##                                             8998                                             8999                                             8999                                             8863                                             8854 
##                                 ngs_profile_MSv1                                     pcr_read_100                                    pcr_read_1000                                      pcr_read_25                                    size_mean_0.6 
##                                             8998                                             9000                                             8997                                             8998                                             8998 
##                                      size_mean_3                                 size_variation_1                                 size_variation_3                                      vdj_barcode vdj_barcode_size_mean_3_variation_1_clone_n_1200 
##                                             8999                                             9000                                             8999                                             6861                                            20058 
##  vdj_barcode_size_mean_3_variation_1_clone_n_600                     vdj_barcode_size_variation_1                     vdj_barcode_size_variation_3                                             <NA> 
##                                            11971                                             6869                                             6855                                                0
d_true_barcode = rename(d_true_barcode, c("seq" = "barcode_seq"))

## Barcode frequency
ggplot(d_true_barcode[, .(freq = sum(freq)), by = .(simu_file, barcode_seq, simu_name)]) + aes(x = simu_name, y = freq) +
    geom_boxplot() +
    # geom_jitter(alpha = 0.2) + 
    theme0 + scale_y_log10() +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 279168 rows containing missing values (`stat_boxplot()`).

AUC

# NOTE: input
d_auc = fread("./tmp/table_auc_no_umi_reference.tsv")
d_count_true_barcode = fread("./tmp/table_count_true_barcode_reference.tsv")

count - true barcode

ggplot(d_count_true_barcode) + aes(x = simu_name, y = count, color = factor(is_true)) +
    geom_boxplot(alpha = 0.2) +
    theme0_lt + scale_y_log10() + scale_color_npg() +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 804850 rows containing missing values (`stat_boxplot()`).
## Warning: Removed 4 rows containing non-finite values (`stat_boxplot()`).

AUC versus simu conditions

ggplot(d_auc) + aes(x = simu_name, y = auc) + geom_boxplot() + theme0 +
    scale_x_discrete(limits = i_level_simu_name) +
    labs(y = "AUC") + lims(y = c(0, 1))
## Warning: Removed 660 rows containing missing values (`stat_boxplot()`).

comparisons_l = list(
    c("size_variation_1", "base"),
    c("size_variation_1", "size_variation_3"),
    c("vdj_barcode_size_variation_1", "vdj_barcode"),
    c("vdj_barcode_size_variation_1", "vdj_barcode_size_variation_3")
    )
symnum.args <- list(cutpoints = c(0, 0.0001, 0.001, 0.01,
          0.05, Inf), symbols = c("****", "***", "**",
          "*", "ns"))
d_auc = d_auc[simu_name %in% i_level_simu_name]
ggplot(d_auc) + aes(x = simu_name, y = aucpr) + geom_boxplot() + theme0 +
    scale_x_discrete(limits = i_level_simu_name) +
    labs(y = "P-R AUC") + lims(y = c(0, NA)) +
    stat_compare_means(method = "wilcox.test", comparisons = comparisons_l, na.rm = T, hide.ns = N)
## Warning in wilcox.test.default(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 0.996666666666667, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 0.996666666666667, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(1, 1, 1, 1, 0.999, 1, 0.999, 1, 1, 0.999, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(1, 1, 1, 1, 0.999, 1, 0.999, 1, 1, 0.999, : cannot compute exact p-value with ties

write_tsv(d_auc, "tmp/Figure_3D.tsv")
ggsave("tmp/Figure_3D.pdf")
## Saving 20 x 12 in image
## Warning in wilcox.test.default(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 0.996666666666667, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 0.996666666666667, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(1, 1, 1, 1, 0.999, 1, 0.999, 1, 1, 0.999, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(1, 1, 1, 1, 0.999, 1, 0.999, 1, 1, 0.999, : cannot compute exact p-value with ties

Resolusion index

d_count_true_barcode = fread("tmp/table_count_true_barcode_reference.tsv")

# d_count_true_barcode[, count_cat := cut(clone_size, breaks = c(0, 1, 2, 3, 4, 5, 10, 20, Inf))]
# lapply(1:100, function(i) {
# d_count_true_barcode[, sum(count < i & is_true), ,by = .(clone_size, simu_file)]
# })

Apply the autothreshold strategy

# NOTE: input
d = fread("./tmp/table_no_umi_auto_threshold_predict_rate_reference.tsv")

# ggplot(d) + aes(x = tpr, y = fpr, label = simu_name) + geom_point() +
# geom_label_repel(alpha = 0.5)
d_plot_pr = melt(d, id.vars = "simu_name", measure.vars=c("pre", "rec"), variable.name = "pr", value.name = "value") 
d_plot_tf = melt(d, id.vars = "simu_name", measure.vars=c("tpr", "fpr"), variable.name = "pr", value.name = "value") 

ggplot(d_plot_pr) + aes(x = simu_name, y = value, fill = factor(pr)) + 
    geom_boxplot() + theme0 +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 1320 rows containing missing values (`stat_boxplot()`).

ggplot(d_plot_pr) + aes(x = simu_name, y = 1 - value) + 
    geom_boxplot() +
    theme0 + facet_grid(pr ~ .) +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 1320 rows containing missing values (`stat_boxplot()`).

ggplot(d_plot_tf) + aes(x = simu_name, y = value) + 
    geom_boxplot() +
    theme0 + facet_grid(pr ~ .) +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 1320 rows containing missing values (`stat_boxplot()`).

Apply the threshold of 0.0001 of left reads

# NOTE: input
d = fread("./tmp/table_no_umi_p0001_threshold_predict_rate_reference.tsv")

# ggplot(d) + aes(x = tpr, y = fpr, label = simu_name) + geom_point() +
# geom_label_repel(alpha = 0.5)
d_plot_pr = melt(d, id.vars = "simu_name", measure.vars=c("pre", "rec"), variable.name = "pr", value.name = "value") 
d_plot_tf = melt(d, id.vars = "simu_name", measure.vars=c("tpr", "fpr"), variable.name = "pr", value.name = "value") 

ggplot(d_plot_pr) + aes(x = simu_name, y = value, fill = factor(pr)) + 
    geom_boxplot() + theme0 +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 1320 rows containing missing values (`stat_boxplot()`).

ggplot(d_plot_pr) + aes(x = simu_name, y = value) + 
    geom_boxplot() +
    theme0 + facet_grid(pr ~ .) +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 1320 rows containing missing values (`stat_boxplot()`).

ggplot(d_plot_tf) + aes(x = simu_name, y = value) + 
    geom_boxplot() +
    theme0 + facet_grid(pr ~ .) +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 1320 rows containing missing values (`stat_boxplot()`).

Merge threshold

d1 = fread("./tmp/table_no_umi_auto_threshold_predict_rate_reference.tsv")[, resolution := "auto"]
d2 = fread("./tmp/table_no_umi_p0001_threshold_predict_rate_reference.tsv")[, resolution := "res0.0001"]
d3 = fread("./tmp/table_no_umi_p00001_threshold_predict_rate_reference.tsv")[, resolution := "res0.00001"]

d = rbindlist(list(d1, d2, d3))
d = melt(d, id.vars = c("simu_name", "resolution"), measure.vars=c("pre", "rec"), variable.name = "pr", value.name = "value") 
d$resolution %<>% factor(levels = c("auto", "res0.0001", "res0.00001"))

ggplot(d) + aes(x = simu_name, y = value, color = resolution) +
    geom_boxplot() + 
    theme0 + facet_grid(pr ~ .) + scale_color_npg() +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 3960 rows containing missing values (`stat_boxplot()`).